knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(phytools)
## Loading required package: ape
## Loading required package: maps
library(ape)
library(readxl)
library(svglite)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
## 
##     ozone
library(RColorBrewer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange()   masks plyr::arrange()
## ✖ purrr::compact()   masks plyr::compact()
## ✖ dplyr::count()     masks plyr::count()
## ✖ dplyr::desc()      masks plyr::desc()
## ✖ dplyr::failwith()  masks plyr::failwith()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::id()        masks plyr::id()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::map()       masks maps::map()
## ✖ dplyr::mutate()    masks plyr::mutate()
## ✖ dplyr::rename()    masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ dplyr::where()     masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggpubr)
## 
## Attaching package: 'ggpubr'
## 
## The following object is masked from 'package:plyr':
## 
##     mutate
## 
## The following object is masked from 'package:ape':
## 
##     rotate
library(rstatix)
## 
## Attaching package: 'rstatix'
## 
## The following objects are masked from 'package:plyr':
## 
##     desc, mutate
## 
## The following object is masked from 'package:stats':
## 
##     filter
library(AMR) #g.test
## Registered S3 method overwritten by 'AMR':
##   method   from  
##   plot.sir igraph
library(car)#multivatiate regression model
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(jtools)#multivatiate regression model
library(interactions)#multivatiate regression model
library(tidyverse)#logistic regression
library(caret)#logistic regression
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Association of RT-qPCR RV Ct value with symptoms, age and sex of the individuals:

PCR_data_RV <- read_excel("PCR_Ct_symp_age.xlsx") 
PCR_data_RV$CollMONTH <- as.factor(substr(PCR_data_RV$CollDate,1,7))
PCR_data_RV$sex <- as.factor(PCR_data_RV$sex)
PCR_data_RV$Symptoms <- as.factor(PCR_data_RV$Symptoms)
PCR_data_RV$AgeGroup <- as.factor(PCR_data_RV$AgeGroup)
PCR_data_RV$Age <- as.numeric(PCR_data_RV$Age)
PCR_data_RV$Ct <- as.numeric(PCR_data_RV$Ct)

#analysis Ct versus age:
PCR_data_RV %>% group_by(AgeGroup) %>% get_summary_stats(Ct, type="mean_sd")
## # A tibble: 5 × 5
##   AgeGroup variable     n  mean    sd
##   <fct>    <fct>    <dbl> <dbl> <dbl>
## 1 18-65    Ct         718  27.5  5.98
## 2 5-17     Ct         217  25.6  6.20
## 3 less5    Ct         209  26.9  5.37
## 4 more65   Ct          80  28.5  5.15
## 5 <NA>     Ct           1  21.6 NA
#plotting Ct vs age groups
ordered_age_groups  = factor(PCR_data_RV$AgeGroup, levels=c("less5", "5-17", "18-65", "more65"))
ggplot(data = PCR_data_RV, mapping = aes(x=ordered_age_groups, y=Ct, fill=ordered_age_groups)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.4, alpha=0.9) 
## Warning: Removed 1 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

#Kruskal test Ct vs age:
res.age.kruskal <- kruskal_test(PCR_data_RV, Ct ~ AgeGroup)
res.age.kruskal
## # A tibble: 1 × 6
##   .y.       n statistic    df        p method        
## * <chr> <int>     <dbl> <int>    <dbl> <chr>         
## 1 Ct     1226      23.2     3 0.000037 Kruskal-Wallis
#calculate effect size
res.age.effsize <- PCR_data_RV %>% kruskal_effsize(Ct ~ AgeGroup)
res.age.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 Ct     1226  0.0165 eta2[H] small
#Wilcoxon’s test - pairwise comparisons between group:
pwc.Ct.age <- PCR_data_RV %>% wilcox_test(Ct ~ AgeGroup, p.adjust.method = "bonferroni") 
pwc.Ct.age
## # A tibble: 6 × 9
##   .y.   group1 group2    n1    n2 statistic         p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl>     <dbl>    <dbl> <chr>       
## 1 Ct    18-65  5-17     718   217    92446. 0.0000303 0.000182 ***         
## 2 Ct    18-65  less5    718   209    79129  0.229     1        ns          
## 3 Ct    18-65  more65   718    80    25732. 0.127     0.762    ns          
## 4 Ct    5-17   less5    217   209    19422  0.01      0.062    ns          
## 5 Ct    5-17   more65   217    80     6101  0.000086  0.000516 ***         
## 6 Ct    less5  more65   209    80     6903  0.022     0.131    ns
#plotting results:
pwc.Ct.age <- pwc.Ct.age %>% add_xy_position(x = "AgeGroup")
ggboxplot(PCR_data_RV, x = "AgeGroup", y = "Ct") +
    stat_pvalue_manual(pwc.Ct.age, hide.ns = TRUE) +
    labs(
        subtitle = get_test_label(res.age.kruskal, detailed = TRUE),
        caption = get_pwc_label(pwc.Ct.age)
    )
## Warning: Removed 1 rows containing non-finite values (`stat_boxplot()`).

#analysis Ct versus symptomatic:
#summary of the dataframe:
PCR_data_RV.NA <- na.omit(PCR_data_RV)

ggboxplot(PCR_data_RV.NA, x="Symptoms", y="Ct")

ggplot(data = PCR_data_RV.NA, mapping = aes(x=Symptoms, y=Ct, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.4, alpha=0.9) 

#Kruskal test
res.sym.kruskal <- kruskal_test(PCR_data_RV.NA, Ct ~ Symptoms)
res.sym.kruskal
## # A tibble: 1 × 6
##   .y.       n statistic    df        p method        
## * <chr> <int>     <dbl> <int>    <dbl> <chr>         
## 1 Ct      962      45.6     1 1.46e-11 Kruskal-Wallis
#calculate effect size
res.sym.effsize <- PCR_data_RV.NA %>% kruskal_effsize(Ct ~ Symptoms)
res.sym.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 Ct      962  0.0464 eta2[H] small
#Wilcoxon’s test to calculate pairwise comparisons between group levels with corrections for multiple testing:
pwc.ct <- PCR_data_RV.NA %>% 
  wilcox_test(Ct ~ Symptoms, p.adjust.method = "bonferroni") 
pwc.ct
## # A tibble: 1 × 7
##   .y.   group1       group2         n1    n2 statistic        p
## * <chr> <chr>        <chr>       <int> <int>     <dbl>    <dbl>
## 1 Ct    Asymptomatic Symptomatic   307   655    127664 1.46e-11
#visualization: box plots with p-values:
pwc.ct <- pwc.ct %>% add_xy_position(x = "Symptoms")
ggboxplot(PCR_data_RV.NA, x = "Symptoms", y = "Ct") +
  stat_pvalue_manual(pwc.ct, hide.ns = TRUE) +
  labs(
    subtitle = get_test_label(res.sym.kruskal, detailed = TRUE),
    caption = get_pwc_label(pwc.ct)
    )

#Age versus symptoms
PCR_data_RV.NA %>% group_by(Symptoms) %>%
          get_summary_stats(Age, type="mean_sd")
## # A tibble: 2 × 5
##   Symptoms     variable     n  mean    sd
##   <fct>        <fct>    <dbl> <dbl> <dbl>
## 1 Asymptomatic Age        307  27.2  20.6
## 2 Symptomatic  Age        655  25.7  18.6
ggboxplot(PCR_data_RV.NA, x="Symptoms", y="Age")

ggplot(data = PCR_data_RV.NA, mapping = aes(x=Symptoms, y=Age, fill=Symptoms)) + geom_violin() + geom_boxplot(width=.1, color="white") + geom_jitter(color="black", size=0.4, alpha=0.9) 

#Kruskal test
res.ageSYM.kruskal <- kruskal_test(PCR_data_RV.NA, Age ~ Symptoms)
res.ageSYM.kruskal
## # A tibble: 1 × 6
##   .y.       n statistic    df     p method        
## * <chr> <int>     <dbl> <int> <dbl> <chr>         
## 1 Age     962     0.486     1 0.486 Kruskal-Wallis
#multivariate multiple regression model to study Ct, age and symptomatic
lm1 <- lm(Ct~Symptoms+Age, data=PCR_data_RV.NA)
Anova(lm1)
## Anova Table (Type II tests)
## 
## Response: Ct
##           Sum Sq  Df F value    Pr(>F)    
## Symptoms    1565   1 47.2194 1.141e-11 ***
## Age          127   1  3.8183   0.05098 .  
## Residuals  31780 959                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm1)

#identify the formula:
summ(lm1, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
## MODEL INFO:
## Observations: 962
## Dependent Variable: Ct
## Type: OLS linear regression 
## 
## MODEL FIT:
## F(2,959) = 26.049, p = 0.000
## R² = 0.052
## Adj. R² = 0.050 
## 
## Standard errors: OLS
## ---------------------------------------------------------------------
##                               Est.     2.5%    97.5%   t val.       p
## ------------------------- -------- -------- -------- -------- -------
## (Intercept)                 27.896   27.071   28.721   66.384   0.000
## SymptomsSymptomatic         -2.738   -3.520   -1.956   -6.872   0.000
## Age                          0.019   -0.000    0.038    1.954   0.051
## ---------------------------------------------------------------------
#plot
interact_plot(lm1, pred=Age, modx=Symptoms, plot.points=TRUE, point.alpha=0.2, interval = TRUE, int.type = "confidence", int.width = .8)
## Warning: Age and Symptoms are not included in an interaction with one another in the
## model.

#verify normality of residuals
qqnorm(lm1$residuals)
qqline(lm1$residuals)

RV seasonality plots with the Washington State genomes:

#Loading the dataframe:
season_data_RV <- read_excel("RV-dataframe.xlsx") 
season_data_RV$CollMONTH <- as.factor(substr(season_data_RV$CollDate,1,7))
season_data_RV$Genotype <- as.factor(season_data_RV$Genotype)
season_data_RV$sex <- as.factor(season_data_RV$IndSex)
season_data_RV$RVSpecie <- as.factor(season_data_RV$RVSpecie)
season_data_RV$Clinica <- as.factor(season_data_RV$Clinica)
season_data_RV$StudyPeriod <- as.factor(season_data_RV$StudyPeriod)
season_data_RV$AgeGroup <- as.factor(season_data_RV$AgeGroup)
season_data_RV$IndAge <- as.numeric(season_data_RV$IndAge)
season_data_RV$Ct <- as.numeric(season_data_RV$Ct)
season_data_RV$ZIPCODE <- as.factor(season_data_RV$ZIPCODE)

# ----->Bubble chart of seasonality of RV genotypes of all the species together. It is needed an Excel file containing a column for the sequence name and a column for the genotype:
condensed_season_data_RV <- ddply(season_data_RV,.(Genotype,CollMONTH),nrow)

ggplot(condensed_season_data_RV, aes(x= factor(CollMONTH), y=Genotype, size=V1)) + geom_point(alpha=0.5) + scale_y_discrete(limits=rev(c("A1B", "A2", "A7", "A9", "A11", "A12", "A13", "A16", "A18", "A20", "A21", "A22", "A23", "A24", "A25", "A28", "A29", "A30", "A31", "A32", "A33", "A34", "A38", "A39", "A45", "A46", "A47", "A49", "A53", "A54", "A58", "A59", "A60", "A61", "A62", "A63", "A64", "A66", "A67", "A68", "A73", "A78", "A80", "A85", "A94", "A101", "A105", "B3", "B4", "B6", "B14", "B27", "B42", "B48", "B70", "B72", "B83", "B91", "B92", "B100", "B101", "Bpat107", "C1", "C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", "C13", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C23", "C25", "C26", "C27", "C28", "C29", "C31", "C33", "C34", "C35", "C36", "C40", "C42", "C43", "C44", "C46", "C53", "C55", "C56"))) + xlab("Collection Date") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 1 rows containing missing values (`geom_point()`).

# ----->Barplot of the seasonality of RV species by month of sample collection. 
n <- length(unique(season_data_RV$RVSpecie))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

ggplot(season_data_RV, aes(x=CollMONTH, fill=RVSpecie)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, n))

# ----->Barplot of the seasonality of RV-A genotypes by month of sample collection. 
season_data_RVA <- subset(season_data_RV, season_data_RV$RVSpecie=="A")
nA <- length(unique(season_data_RVA$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

ggplot(season_data_RVA, aes(x=CollMONTH, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, nA), breaks = c("A1B", "A2", "A7", "A9", "A11", "A12", "A13", "A16", "A18", "A20", "A21", "A22", "A23", "A24", "A25", "A28", "A29", "A30", "A31", "A32", "A33", "A34", "A38", "A39", "A45", "A46", "A47", "A49", "A53", "A54", "A58", "A59", "A60", "A61", "A62", "A63", "A64", "A66", "A67", "A68", "A73", "A78", "A80", "A85", "A94", "A101", "A105")) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ----->Barplot of the seasonality of RV-B genotypes by month of sample collection. 
season_data_RVB <- subset(season_data_RV, season_data_RV$RVSpecie=="B")
nB <- length(unique(season_data_RVB$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

ggplot(season_data_RVB, aes(x=CollMONTH, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, nB), breaks = c("B3", "B4", "B6", "B14", "B27", "B42", "B48", "B70", "B72", "B83", "B91", "B92", "B100", "B101", "Bpat107")) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

# ----->Barplot of the seasonality of RV-C genotypes by month of sample collection. 
season_data_RVC <- subset(season_data_RV, season_data_RV$RVSpecie=="C")
nC <- length(unique(season_data_RVC$Genotype))
qual_col_pals = brewer.pal.info[brewer.pal.info$category== 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))

ggplot(season_data_RVC, aes(x=CollMONTH, fill=Genotype)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y="Proportion", x="Collection date (month)") + scale_fill_manual(values = sample(col_vector, nC), breaks = c("C1", "C2", "C3", "C4", "C6", "C7", "C8", "C9", "C11", "C13", "C15", "C16", "C17", "C18", "C19", "C20", "C21", "C23", "C25", "C26", "C27", "C28", "C29", "C31", "C33", "C34", "C35", "C36", "C40", "C42", "C43", "C44", "C46", "C53", "C55", "C56")) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Assocaition of RV species and genotypes with with clinical and epidemiological characteristics:

#RV species versus individuals age
ggplot(season_data_RV, aes(x=RVSpecie, y=IndAge, fill=RVSpecie)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="white", alpha=0.2) + geom_jitter(color="black", size=0.4, alpha=0.9) + theme_minimal()
## Warning: Removed 477 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 477 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 477 rows containing missing values (`geom_point()`).

#RV genotypes within RV species versus individuals age
season_data_RVA <- subset(season_data_RV, subset = RVSpecie == "A")
season_data_RVB <- subset(season_data_RV, subset = RVSpecie == "B")
season_data_RVC <- subset(season_data_RV, subset = RVSpecie == "C")

ggplot(season_data_RVA, aes(x=Genotype, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 330 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 330 rows containing non-finite values (`stat_boxplot()`).

ggplot(season_data_RVB, aes(x=Genotype, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 54 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 54 rows containing non-finite values (`stat_boxplot()`).

ggplot(season_data_RVC, aes(x=Genotype, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Removed 93 rows containing non-finite values (`stat_ydensity()`).
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Warning: Removed 93 rows containing non-finite values (`stat_boxplot()`).

#Age Group and sex by RV species
df_clinica <- season_data_RV[!is.na(season_data_RV$IndAge),] %>% filter(IndSex!='U')
ggplot(df_clinica, aes(x=RVSpecie, fill=AgeGroup)) + geom_bar(position = "dodge") 

ggplot(df_clinica, aes(x=AgeGroup, fill=RVSpecie)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))

ggplot(df_clinica, aes(x=RVSpecie, fill=IndSex)) + geom_bar(position = "dodge")

#statistical test
  df_clinica_sex.spe <- as.matrix(table(df_clinica$RVSpecie, df_clinica$IndSex))  
  g.test(df_clinica_sex.spe)
## 
##  G-test of independence
## 
## data:  df_clinica_sex.spe
## X-squared = 2.38, p-value = 0.3042
  df_clinica_age.spe <- as.matrix(table(df_clinica$RVSpecie, df_clinica$AgeGroup))  
  g.test(df_clinica_age.spe)
## Warning in g.test(df_clinica_age.spe): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica_age.spe
## X-squared = 26.226, p-value = 0.000202
#Symptoms versus RV species
df_clinica.SYM <- df_clinica[!is.na(df_clinica$Clinica),]
ggplot(df_clinica.SYM, aes(x=Clinica, fill=RVSpecie)) + geom_bar(position = "dodge")

#statistical test
  df_clinica_sym.spe <- as.matrix(table(df_clinica.SYM$RVSpecie, df_clinica.SYM$Clinica))  
  g.test(df_clinica_sym.spe)
## 
##  G-test of independence
## 
## data:  df_clinica_sym.spe
## X-squared = 0.21346, p-value = 0.8988
#Symptoms by age group per RV species
df_clinica.SYMA <- subset(df_clinica.SYM, subset = RVSpecie == "A")
df_clinica.SYMB <- subset(df_clinica.SYM, subset = RVSpecie == "B")
df_clinica.SYMC <- subset(df_clinica.SYM, subset = RVSpecie == "C")

ggplot(df_clinica.SYMA, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))

    df_clinica.SYMA_sym.age <- as.matrix(table(df_clinica.SYMA$Clinica, df_clinica.SYMA$AgeGroup))  
    g.test(df_clinica.SYMA_sym.age)
## Warning in g.test(df_clinica.SYMA_sym.age): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica.SYMA_sym.age
## X-squared = 1.9512, p-value = 0.5826
ggplot(df_clinica.SYMB, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))

    df_clinica.SYMB_sym.age <- as.matrix(table(df_clinica.SYMB$Clinica, df_clinica.SYMB$AgeGroup))  
    g.test(df_clinica.SYMB_sym.age)
## Warning in g.test(df_clinica.SYMB_sym.age): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica.SYMB_sym.age
## X-squared = 2.9685, p-value = 0.3965
ggplot(df_clinica.SYMC, aes(x=AgeGroup, fill=Clinica)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))

    df_clinica.SYMC_sym.age <- as.matrix(table(df_clinica.SYMC$Clinica, df_clinica.SYMC$AgeGroup))  
    g.test(df_clinica.SYMC_sym.age)
## Warning in g.test(df_clinica.SYMC_sym.age): G-statistic approximation may be
## incorrect due to E < 5
## 
##  G-test of independence
## 
## data:  df_clinica.SYMC_sym.age
## X-squared = 5.3358, p-value = 0.1488
#logistic regressions to evaluate association with geographic location and individuals age:
season_data_RV %>% group_by(ZIPCODE) %>%
get_summary_stats(IndAge, type="mean_sd") %>% print(n = 25)
## # A tibble: 24 × 5
##    ZIPCODE variable     n  mean    sd
##    <fct>   <fct>    <dbl> <dbl> <dbl>
##  1 98001   IndAge      32  30.5 19.1 
##  2 98003   IndAge       1  41   NA   
##  3 98006   IndAge       1  44   NA   
##  4 98007   IndAge      99  30.7 20.5 
##  5 98021   IndAge       1  53   NA   
##  6 98036   IndAge      21  25.9 20.5 
##  7 98037   IndAge       3  32   31.0 
##  8 98103   IndAge       1  52   NA   
##  9 98104   IndAge       7  40.9 16.3 
## 10 98105   IndAge       1  69   NA   
## 11 98107   IndAge       4  13.2  9.18
## 12 98109   IndAge       4  33   25.6 
## 13 98116   IndAge      31  29.3 18.7 
## 14 98118   IndAge      15  28.5 21.2 
## 15 98133   IndAge     106  26.2 20.3 
## 16 98134   IndAge      43  29   17.5 
## 17 98144   IndAge       1  24   NA   
## 18 98155   IndAge       5  46.2  9.76
## 19 98195   IndAge      80  28.1 21.5 
## 20 98201   IndAge      14  22   17.6 
## 21 98272   IndAge      10  21.5 23.2 
## 22 98908   IndAge       5   9.2 15.6 
## 23 99336   IndAge       4  21.5 13.2 
## 24 <NA>    IndAge      37  12.3 17.4
#plotting
season_data_RV_clean <- season_data_RV[!is.na(season_data_RV$ZIPCODE),]

ggplot(season_data_RV_clean, aes(x=ZIPCODE, y=IndAge)) + geom_violin(width=1) + geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Groups with fewer than two data points have been dropped.
## Warning: Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.
## Groups with fewer than two data points have been dropped.

#if leaving zip code with more than 5 cases reported:
season_data_RV.5 <- season_data_RV %>%
  group_by(ZIPCODE) %>%
  filter(n() > 5) %>%
  na.omit(season_data_RV.5$ZIPCODE)

ggplot(season_data_RV.5, aes(x=ZIPCODE, y=IndAge)) +  geom_boxplot(width=0.1, color="black", alpha=0.2) + theme(axis.text.x = element_text(angle = 45, hjust = 1))

#multivariate multiple regression model to study Ct, symptoms and RV SPECIES (using data frame of sequenced samples):
season_data_RV_clean <- season_data_RV[!is.na(season_data_RV$Clinica),]

lm_reg1 <- lm(Ct~Clinica+RVSpecie, season_data_RV_clean)
Anova(lm_reg1)
## Anova Table (Type II tests)
## 
## Response: Ct
##           Sum Sq  Df F value  Pr(>F)  
## Clinica     87.4   1  5.1430 0.02381 *
## RVSpecie    38.3   2  1.1276 0.32473  
## Residuals 7661.8 451                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#plot the model
avPlots(lm_reg1)

#identify the formula:
summ(lm_reg1, confint=TRUE, digits=3) #in this case the formula is: Ct= 27.892 + 0.018 x age - 2.715 x symptomatic (0=asymptomatic, 1=symptomatic: categorical variables are choose as baseline depending on alphabetical order)
## MODEL INFO:
## Observations: 455
## Dependent Variable: Ct
## Type: OLS linear regression 
## 
## MODEL FIT:
## F(3,451) = 2.417, p = 0.066
## R² = 0.016
## Adj. R² = 0.009 
## 
## Standard errors: OLS
## --------------------------------------------------------------------
##                              Est.     2.5%    97.5%   t val.       p
## ------------------------ -------- -------- -------- -------- -------
## (Intercept)                23.525   22.674   24.376   54.344   0.000
## ClinicaSymptomatic         -1.027   -1.917   -0.137   -2.268   0.024
## RVSpecieB                   0.894   -0.318    2.105    1.450   0.148
## RVSpecieC                   0.326   -0.500    1.152    0.775   0.439
## --------------------------------------------------------------------
#verify normality of residuals
qqnorm(lm_reg1$residuals)
qqline(lm_reg1$residuals)

#Ct value and the genotypes (with more than 5 cases detected):
season_data_RV_CT.5 <- season_data_RV %>% group_by(Genotype) %>% filter(n() > 5) %>% filter(Ct!="NA")
season_data_RV_CT.5$Genotype<- as.factor(season_data_RV_CT.5$Genotype)
#Kruskal test
res.gen.ct.kruskal <- kruskal_test(season_data_RV_CT.5, season_data_RV_CT.5$Ct ~ season_data_RV_CT.5$Genotype)
res.gen.ct.kruskal
## # A tibble: 36 × 7
##    Genotype .y.                        n statistic    df          p method      
##  * <fct>    <chr>                  <int>     <dbl> <int>      <dbl> <chr>       
##  1 A101     season_data_RV_CT.5$Ct    35      84.6    35 0.00000538 Kruskal-Wal…
##  2 A11      season_data_RV_CT.5$Ct     4      84.6    35 0.00000538 Kruskal-Wal…
##  3 A1B      season_data_RV_CT.5$Ct    79      84.6    35 0.00000538 Kruskal-Wal…
##  4 A2       season_data_RV_CT.5$Ct     4      84.6    35 0.00000538 Kruskal-Wal…
##  5 A20      season_data_RV_CT.5$Ct     7      84.6    35 0.00000538 Kruskal-Wal…
##  6 A22      season_data_RV_CT.5$Ct     6      84.6    35 0.00000538 Kruskal-Wal…
##  7 A23      season_data_RV_CT.5$Ct     5      84.6    35 0.00000538 Kruskal-Wal…
##  8 A24      season_data_RV_CT.5$Ct     5      84.6    35 0.00000538 Kruskal-Wal…
##  9 A25      season_data_RV_CT.5$Ct    15      84.6    35 0.00000538 Kruskal-Wal…
## 10 A28      season_data_RV_CT.5$Ct     2      84.6    35 0.00000538 Kruskal-Wal…
## # ℹ 26 more rows
#calculate effect size
res.gen.ct.effsize <- season_data_RV %>% kruskal_effsize(Ct ~ Genotype)
res.gen.ct.effsize #(0.01- < 0.06 (small effect), 0.06 - < 0.14 (moderate effect) and >= 0.14 (large effect))
## # A tibble: 1 × 5
##   .y.       n effsize method  magnitude
## * <chr> <int>   <dbl> <chr>   <ord>    
## 1 Ct     1003  0.0869 eta2[H] moderate
ggboxplot(season_data_RV_CT.5, x = "Genotype", y = "Ct") + theme(axis.text.x = element_text(angle = 45, hjust = 1))